{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TASK 2 (2017-2019) - Interaction Variable\n",
    "\n",
    "\n",
    "import sys\n",
    "import os\n",
    "import logging\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from datetime import datetime\n",
    "\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from joblib import dump, load\n",
    "\n",
    "import views\n",
    "from views import Ensemble, Model, Downsampling, Period\n",
    "from views.utils.data import assign_into_df\n",
    "from views.apps.transforms import lib as translib\n",
    "from views.apps.evaluation import lib as evallib, feature_importance as fi\n",
    "from views.apps.model import api\n",
    "from views.apps.extras import extras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = views.DATASETS[\"cm_africa_imp_0\"]\n",
    "df = dataset.df\n",
    "#print(df)\n",
    "level = \"cm\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_path = \"./models/eval_real/{sub}\"\n",
    "out_paths = {\n",
    "    \"evaluation2\": model_path.format(sub=\"evaluation2\"),\n",
    "    \"features2\": model_path.format(sub=\"features2\")\n",
    "}\n",
    "for k, v in out_paths.items():\n",
    "    if not os.path.isdir(v):\n",
    "        os.makedirs(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import our data\n",
    "wiki = pd.read_csv(r'/home/default/Dokumente/TRINITY/comp/prediction-project/data/Wiki_Data_Final_2021.csv')\n",
    "multi = wiki.set_index(['month_id', 'country_id'])\n",
    "multi = multi.sort_index(level=0)\n",
    "\n",
    "df = df.merge(multi, on=['month_id', 'country_id'], how='left')\n",
    "df = df.fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create interaction variables\n",
    "df['ENsub']= df['viewsEnglish']*df['ENspeaking']\n",
    "df['FRsub']= df['viewsFrench']*df['FRspeaking']\n",
    "df['ArabSpring_EN']= df['viewsEnglish']*df['ArabSpring']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Partition Dataset in Training, Test and Calibration set\n",
    "period_calib = api.Period(\n",
    "   name=\"calib\", \n",
    "   train_start=121,   # 1990-01\n",
    "   train_end=408,     # 2013.12\n",
    "   predict_start=409, # 2014.01\n",
    "   predict_end=443,   # 2016.11\n",
    ")\n",
    "\n",
    "\n",
    "period_test = api.Period(\n",
    "   name=\"test\", \n",
    "   train_start=121,   # 1990-01\n",
    "   train_end=443,     # 2016.11\n",
    "   predict_start=445, # 2017.01\n",
    "   predict_end=480,   # 2019.12\n",
    ")\n",
    "\n",
    "\n",
    "periods = [period_calib, period_test]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The steps to train, predict and evaluate for.\n",
    "steps = [2,3,4,5,6,7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Subset \"most important\" features from the ViEWS-dataset and Wikipedia variables\n",
    "\n",
    "cols_features = df[['ged_best_sb','wdi_vc_btl_deth', 'reign_precip','reign_prev_conflict','reign_tenure_months',\n",
    "           'reign_irregular','reign_lastelection','reign_loss','reign_pctile_risk','reign_couprisk',\n",
    "          'ged_count_sb','ged_best_os','ged_count_os','wdi_eg_use_pcap_kg_oe','ged_best_ns',\n",
    "           'vdem_v2x_accountability', 'wdi_nv_srv_totl_zs','wdi_sp_pop_totl','wdi_sl_tlf_totl_fe_zs',\n",
    "          'wdi_sm_pop_totl_zs', 'wdi_sm_pop_refg_or', 'wdi_dt_oda_odat_pc_zs', 'ged_count_ns', \n",
    "           'reign_gov_dominant_party', 'reign_age', 'vdem_v2xpe_exlpol', 'wdi_ag_lnd_frst_k2', \n",
    "           'wdi_bg_gsr_nfsv_gd_zs', 'wdi_st_int_rcpt_xp_zs', 'vdem_v2x_clpol', 'vdem_v2x_genpp',\n",
    "                   'ENsub', 'FRsub', 'ArabSpring_EN']]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Specify number of estimators in Random Forest\n",
    "n_estimators = 200"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train Model\n",
    "\n",
    "task2_delta = api.Model(\n",
    "    name=\"task2_delta\",                \n",
    "    col_outcome=\"ln_ged_best_sb\",    \n",
    "    cols_features=cols_features,     \n",
    "    steps=steps,                     \n",
    "    outcome_type=\"real\",             \n",
    "    periods=periods,                 \n",
    "    estimator=RandomForestRegressor( \n",
    "        n_estimators=n_estimators,\n",
    "        criterion=\"mse\",\n",
    "        n_jobs=-1,\n",
    "    ),\n",
    "    delta_outcome=True,            \n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "models = [task2_delta]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 15min 2s, sys: 14.8 s, total: 15min 16s\n",
      "Wall time: 15min 11s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Train all models\n",
    "for model in models:\n",
    "    model.fit_estimators(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store predictions and calibrated predictions for all models in our dataframe\n",
    "for model in models:\n",
    "    df_predictions = model.predict(df)\n",
    "    df = assign_into_df(df, df_predictions)\n",
    "    df_predictions = model.predict_calibrated(\n",
    "        df=df,\n",
    "        period_calib = period_calib,\n",
    "        period_test = period_test\n",
    "    )\n",
    "    df = assign_into_df(df, df_predictions)\n",
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluate all models. Scores are stored in the model object\n",
    "for model in models:\n",
    "    model.evaluate(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wrote scores table to ./models/eval_real/evaluation2/task2_delta_cm_uncalibrated_scores.tex.\n"
     ]
    }
   ],
   "source": [
    "# Select test partition and compute model performance\n",
    "\n",
    "partition = \"test\"\n",
    "\n",
    "for model in models:\n",
    "    for calib in [\"uncalibrated\"]:\n",
    "        scores = {\n",
    "            \"Step\":[], \n",
    "            \"MSE\":[]\n",
    "        }\n",
    "        if model.delta_outcome:\n",
    "            scores.update({\"TADDA\":[]}) \n",
    "            \n",
    "        for key, value in model.scores[partition].items():\n",
    "            if key != \"sc\":\n",
    "                scores[\"Step\"].append(key)\n",
    "                scores[\"MSE\"].append(value[calib][\"mse\"])\n",
    "                if model.delta_outcome:\n",
    "                    scores[\"TADDA\"].append(value[calib][\"tadda_score\"])\n",
    "\n",
    "        out = pd.DataFrame(scores)\n",
    "        tex = out.to_latex(index=False)\n",
    "\n",
    "        # Add meta.\n",
    "        now = datetime.now().strftime(\"%Y/%m/%d %H:%M:%S\")\n",
    "        meta = f\"\"\"\n",
    "        %Output created by wb_models.ipynb.\n",
    "        %Evaluation of {model.col_outcome} per step.\n",
    "        %Run on selected {model.name} features at {level} level.\n",
    "        %Produced on {now}, written to {out_paths[\"evaluation2\"]}.\n",
    "        \\\\\n",
    "        \"\"\"\n",
    "        tex = meta + tex\n",
    "        path_out = os.path.join(\n",
    "            out_paths[\"evaluation2\"], \n",
    "            f\"{model.name}_{level}_{calib}_scores.tex\"\n",
    "        )\n",
    "        with open(path_out, \"w\") as f:\n",
    "            f.write(tex)\n",
    "        print(f\"Wrote scores table to {path_out}.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Step       MSE     TADDA\n",
      "0     2  0.708606  0.547618\n",
      "1     3  0.742311  0.553111\n",
      "2     4  0.719117  0.529548\n",
      "3     5  0.827056  0.580850\n",
      "4     6  0.808638  0.578684\n",
      "5     7  0.801089  0.566717\n"
     ]
    }
   ],
   "source": [
    "print(out)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
